{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n==============================================\nFeature agglomeration vs. univariate selection\n==============================================\n\nThis example compares 2 dimensionality reduction strategies:\n\n- univariate feature selection with Anova\n\n- feature agglomeration with Ward hierarchical clustering\n\nBoth methods are compared in a regression problem using\na BayesianRidge as supervised estimator.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>\n# License: BSD 3 clause\n\nprint(__doc__)\n\nimport shutil\nimport tempfile\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy import linalg, ndimage\nfrom joblib import Memory\n\nfrom sklearn.feature_extraction.image import grid_to_graph\nfrom sklearn import feature_selection\nfrom sklearn.cluster import FeatureAgglomeration\nfrom sklearn.linear_model import BayesianRidge\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import KFold\n\n# #############################################################################\n# Generate data\nn_samples = 200\nsize = 40  # image size\nroi_size = 15\nsnr = 5.\nnp.random.seed(0)\nmask = np.ones([size, size], dtype=np.bool)\n\ncoef = np.zeros((size, size))\ncoef[0:roi_size, 0:roi_size] = -1.\ncoef[-roi_size:, -roi_size:] = 1.\n\nX = np.random.randn(n_samples, size ** 2)\nfor x in X:  # smooth data\n    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()\nX -= X.mean(axis=0)\nX /= X.std(axis=0)\n\ny = np.dot(X, coef.ravel())\nnoise = np.random.randn(y.shape[0])\nnoise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)\ny += noise_coef * noise  # add noise\n\n# #############################################################################\n# Compute the coefs of a Bayesian Ridge with GridSearch\ncv = KFold(2)  # cross-validation generator for model selection\nridge = BayesianRidge()\ncachedir = tempfile.mkdtemp()\nmem = Memory(location=cachedir, verbose=1)\n\n# Ward agglomeration followed by BayesianRidge\nconnectivity = grid_to_graph(n_x=size, n_y=size)\nward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,\n                            memory=mem)\nclf = Pipeline([('ward', ward), ('ridge', ridge)])\n# Select the optimal number of parcels with grid search\nclf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)\nclf.fit(X, y)  # set the best parameters\ncoef_ = clf.best_estimator_.steps[-1][1].coef_\ncoef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)\ncoef_agglomeration_ = coef_.reshape(size, size)\n\n# Anova univariate feature selection followed by BayesianRidge\nf_regression = mem.cache(feature_selection.f_regression)  # caching function\nanova = feature_selection.SelectPercentile(f_regression)\nclf = Pipeline([('anova', anova), ('ridge', ridge)])\n# Select the optimal percentage of features with grid search\nclf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)\nclf.fit(X, y)  # set the best parameters\ncoef_ = clf.best_estimator_.steps[-1][1].coef_\ncoef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))\ncoef_selection_ = coef_.reshape(size, size)\n\n# #############################################################################\n# Inverse the transformation to plot the results on an image\nplt.close('all')\nplt.figure(figsize=(7.3, 2.7))\nplt.subplot(1, 3, 1)\nplt.imshow(coef, interpolation=\"nearest\", cmap=plt.cm.RdBu_r)\nplt.title(\"True weights\")\nplt.subplot(1, 3, 2)\nplt.imshow(coef_selection_, interpolation=\"nearest\", cmap=plt.cm.RdBu_r)\nplt.title(\"Feature Selection\")\nplt.subplot(1, 3, 3)\nplt.imshow(coef_agglomeration_, interpolation=\"nearest\", cmap=plt.cm.RdBu_r)\nplt.title(\"Feature Agglomeration\")\nplt.subplots_adjust(0.04, 0.0, 0.98, 0.94, 0.16, 0.26)\nplt.show()\n\n# Attempt to remove the temporary cachedir, but don't worry if it fails\nshutil.rmtree(cachedir, ignore_errors=True)"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}